Loading the libraries we will use for carrying out the EDA

library(tidyverse)
library(stringr)
library(caret)
library(plotly)
library(ggthemes)
library(GGally)
library(class)
library(e1071)
library(stringr)
library(maps)

Introduction

Image 1: Logo Budweiser

Reading data files

Addressing the missing values in each column

#Cleaning ABV using mean
df_beers_cl0 = df_beers
nr_mean_abv = mean(df_beers_cl0[!is.na(df_beers_cl0$ABV),]$ABV) #Calculate the mean
length_abv = length(df_beers_cl0[is.na(df_beers_cl0$ABV),]$ABV) 
if(length_abv > 0){
  df_beers_cl0[is.na(df_beers_cl0$ABV),]$ABV = nr_mean_abv      #Replacing Missing value with mean
}

#Cleaning IBU using mean
df_beers_cl1 = df_beers
nr_mean_ibu = mean(df_beers_cl1[!is.na(df_beers_cl1$IBU),]$IBU)
length_ibu = length(df_beers_cl1[is.na(df_beers_cl1$IBU),]$IBU)
if(length_ibu > 0){
  df_beers_cl1[is.na(df_beers_cl1$IBU),]$IBU = nr_mean_ibu
}


#Cleaning using KnnInpute
##Creating the model K = 20
knn_imp_model <- preProcess(df_beers_cl0 %>% 
                          select(ABV,IBU),
                            method = c("knnImpute"),
                            k = 20,
                            knnSummary = mean)


#Running the model
df_beers_unp <- predict(knn_imp_model, df_beers_cl0,na.action = na.pass)
#The beer data set will be normalized. To de-normalize and get the original data back:
procNames <- data.frame(col = names(knn_imp_model$mean), mean = knn_imp_model$mean, sd = knn_imp_model$std)
for(i in procNames$col){
 df_beers_unp[i] <- df_beers_unp[i]*knn_imp_model$std[i]+knn_imp_model$mean[i] 
}


nr_rows = dim(df_beers)[1]
df_beers  %>% ggplot(aes(x=IBU))+geom_histogram(fill="black",binwidth = 3)+
  labs(title="IBU before performing data cleansing",x="IBU(International bitterness Unit)",y="Observation number")
## Warning: Removed 1005 rows containing non-finite values (`stat_bin()`).

df_beers  %>% ggplot(aes(x=ABV))+geom_histogram(fill="black")+
  labs(title="ABV before performing data cleansing",x="ABV(Alcohol by Volume)",y="Observation number")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 62 rows containing non-finite values (`stat_bin()`).

#Plotting IBU using Mean 
df_beers_cl1  %>% ggplot(aes(x=IBU))+geom_histogram(fill="blue")+
  labs(title="IBU after performing data cleansing",subtitle = "Calculation using Mean",x="IBU(International bitterness Unit)",y="Observation number")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Plotting IBU KNNImputation and ABV after data cleansing
df_beers_unp  %>% ggplot(aes(x=IBU))+geom_histogram(fill="blue",col="black",binwidth = 3)+labs(title="IBU after performing data cleansing",subtitle = "Calculation using KnnImputation",x="IBU(International bitterness Unit)",y="Observation number")

df_beers_unp  %>% ggplot(aes(x=ABV))+geom_histogram(fill="blue",col="black")+labs(title="ABV after performing data cleansing",x="ABV(Alcohol by Volume)",y="Observation number")+geom_density(lwd = 1.2,
               linetype = 2,
               colour = 2)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

skewness(df_beers_unp$ABV)
## [1] 0.9698248
kurtosis(df_beers_unp$ABV)
## [1] 1.245737
summary(df_beers_unp$ABV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00100 0.05000 0.05700 0.05977 0.06700 0.12800
cor(df_beers_unp$ABV,df_beers_unp$IBU)
## [1] 0.7390397

How many breweries are present in each state

df_summary = df_breweries_2 %>% group_by(State,Name_State) %>% summarize(NumberBreweries = n())
## `summarise()` has grouped output by 'State'. You can override using the
## `.groups` argument.
knitr::kable(
  df_summary,
  caption = "Number of Beers by State"
)
Number of Beers by State
State Name_State NumberBreweries
AK Alaska 7
AL Alabama 3
AR Arkansas 2
AZ Arizona 11
CA California 39
CO Colorado 47
CT Connecticut 8
DC District of Columbia 1
DE Delaware 2
FL Florida 15
GA Georgia 7
HI Hawaii 4
IA Iowa 5
ID Idaho 5
IL Illinois 18
IN Indiana 22
KS Kansas 3
KY Kentucky 4
LA Louisiana 5
MA Massachusetts 23
MD Maryland 7
ME Maine 9
MI Michigan 32
MN Minnesota 12
MO Missouri 9
MS Mississippi 2
MT Montana 9
NC North Carolina 19
ND North Dakota 1
NE Nebraska 5
NH New Hampshire 3
NJ New Jersey 3
NM New Mexico 4
NV Nevada 2
NY New York 16
OH Ohio 15
OK Oklahoma 6
OR Oregon 29
PA Pennsylvania 25
RI Rhode Island 5
SC South Carolina 4
SD South Dakota 1
TN Tennessee 3
TX Texas 28
UT Utah 4
VA Virginia 16
VT Vermont 10
WA Washington 23
WI Wisconsin 20
WV West Virginia 1
WY Wyoming 4

Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the merged file

df_beerbre_unp = merge(df_beers_unp,df_breweries_2,by.x = "Brewery_id",by.y = "Brew_ID")

Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare

df_acom_bebrew_1 = df_beerbre_unp %>% group_by(State,Name_State) %>% summarize(Median_ABV = mean(ABV),Median_IBU = mean(IBU))

# df_acom_bebrew_1 %>% ggplot(aes(x=State,color=State))+geom_bar()+labs(title = "Alcohol by Volume",subtitle = "Alcohol by Volume average by State")+coord_flip()

Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer

df_sort_1 = arrange(df_beerbre_unp,desc(ABV)) %>% head(n = 1)
sprintf("The state that has the maximum ABV is %s-%f",df_sort_1$Name_State,df_sort_1$ABV)
## [1] "The state that has the maximum ABV is Colorado-0.128000"

Comment on the summary statistics and distribution of the ABV variable

skewness(df_beerbre_unp$ABV)
## [1] 0.9698248
kurtosis(df_beerbre_unp$ABV)
## [1] 1.245737

Accordint to the histogram we can notice that the data seems normally distributed

df_beerbre_unp %>% ggplot(aes(x=ABV,fill=State))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot. Make your best judgment of a relationship and EXPLAIN your answer

df_beerbre_unp %>% select(IBU,ABV) %>% ggpairs(columnLabels = c("ABV","IBU"))+
  labs(title="Relationship between the bitterness of the beer(IBU) and its alcoholic content(ABV)")

Activity 8, Difference with respect to IBU and ABV IPA and ALE, using KNN

df_beerbre_fil0 = filter(df_beerbre_unp,str_detect(df_beerbre_unp$Style,regex("IPA|ALE",ignore_case = TRUE)))


#We create the column Type for classifying IPA Beer Styles and the Rest
df_beerbre_fil1 = mutate(df_beerbre_fil0,Type= ifelse(str_detect(Style,regex("IPA",ignore_case = TRUE)),"IPA","ALE"))
#We plot the relationship between IBU and ABV by Beer Type
df_beerbre_fil1 %>% ggplot(aes(x=ABV,y=IBU,color=Type))+geom_point()+
  labs(title = "Relationship IBU and AVB",subtitle = "Relationship IBU/AVB by Beer Type")+xlab("Alcohol by Volume (ABV)")+ylab("International Bitterness Unit(IBU)")

#Initializing the model
nr_percentage = 0.7
nr_observations = nrow(df_beerbre_fil1)
nr_k = 5

#Creating the training and Testing dataset
lst_index_beer = sample(nr_observations,round(nr_observations * nr_percentage))
df_train = df_beerbre_fil1[lst_index_beer,]
df_test = df_beerbre_fil1[-lst_index_beer,]

#Running the model
knn_result = knn(df_train[,c(4,5)],df_test[,c(4,5)],df_train$Type,prob = TRUE,k=nr_k)
co_table =  table(knn_result,df_test$Type)
confusionMatrix(co_table)
## Confusion Matrix and Statistics
## 
##           
## knn_result ALE IPA
##        ALE 261  35
##        IPA  42 135
##                                           
##                Accuracy : 0.8372          
##                  95% CI : (0.8008, 0.8693)
##     No Information Rate : 0.6406          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.6496          
##                                           
##  Mcnemar's Test P-Value : 0.4941          
##                                           
##             Sensitivity : 0.8614          
##             Specificity : 0.7941          
##          Pos Pred Value : 0.8818          
##          Neg Pred Value : 0.7627          
##              Prevalence : 0.6406          
##          Detection Rate : 0.5518          
##    Detection Prevalence : 0.6258          
##       Balanced Accuracy : 0.8278          
##                                           
##        'Positive' Class : ALE             
## 

Find one other useful inference from the data that you feel Budweiser may be able to find value in

#We accomulate the number of beers by style in each state
df_sum_1 = group_by(df_beerbre_unp,State,Name_State,Style) %>% summarize(count_style = n())
## `summarise()` has grouped output by 'State', 'Name_State'. You can override
## using the `.groups` argument.
df_sum_1 = mutate(df_sum_1,flavor_type = ifelse(str_detect(Style,regex("IPA",ignore_case = TRUE)),"Ipa","Pale"))
df_sum_1 = mutate(df_sum_1,general_stype = ifelse(str_detect(Style,regex("lager",ignore_case = TRUE)),"Lager","Ale"))

#We accomulate the number of beers by style but only the most popular(Beer styles with more than five beers)
df_sum_2 = group_by(df_beerbre_unp,State,Name_State,Style) %>% summarize(count_style = n()) %>% filter(count_style>5) %>% mutate(Name_State_2=str_trim(str_to_lower(Name_State))) 
## `summarise()` has grouped output by 'State', 'Name_State'. You can override
## using the `.groups` argument.
df_sum_3 = group_by(df_beerbre_unp,State,Name_State) %>% summarize(count_style = n()) %>% mutate(Name_State_2=str_trim(str_to_lower(Name_State))) 
## `summarise()` has grouped output by 'State'. You can override using the
## `.groups` argument.
#We accomulate by Style and in addition create a column with the attribute Lager
df_sum_4 = group_by(df_beerbre_unp,State,Name_State,Style) %>% summarize(count_style = n()) %>%  mutate(Name_State_2=str_trim(str_to_lower(Name_State)),Lager=ifelse(str_detect(Style,regex("lager",ignore_case = TRUE)),"X","")) %>%
  filter(count_style>5)
## `summarise()` has grouped output by 'State', 'Name_State'. You can override
## using the `.groups` argument.
#Plotting the Popular Beer Style by State x= Style, Y = Count of beer , color = Name of the State
gfr = df_sum_2 %>% arrange(Name_State) %>% ggplot(aes(x=Style,y=count_style,fill = Name_State ))+geom_bar(stat = "identity",position = "stack")+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),legend.position = "left",text = element_text(size = 10))+labs(title = "Most popular beer style in USA by State",subtitle = "Style with a very few beers are not shown in this analysis")+xlab("USA State")+ylab("Number of Beers by Style")
ggplotly(gfr)
gfr = df_sum_4 %>% arrange(Name_State) %>% ggplot(aes(x=Name_State,y=count_style,fill = Lager))+geom_bar(stat = "identity",position = "stack")+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),legend.position = "left",text = element_text(size = 10))


#Dataframe Map by State
df_usa_states_1 = map_data("state")
df_usa_states_2 =  map_data("state")


#We calculate the mean of every state in order to locate the Label in the center
df_usa_states_3 = df_usa_states_1 %>% group_by(region) %>% summarize(lat=mean(lat),long=mean(long)) %>% mutate(AB_State = str_to_upper(str_sub(region,1,2)))

#Merge of the beer data and the coordinates and locations
df_state_beer_1 = merge(df_sum_2,df_usa_states_1,by.x = "Name_State_2",by.y = "region")
df_state_beer_2 = merge(df_usa_states_1,df_sum_3,by.x="region",by.y = "Name_State_2")
df_state_beer_3 = merge(df_usa_states_3,df_sum_3,by.x="region",by.y = "Name_State_2") 


#Plotting the MAP and The label of each state
map = df_usa_states_1 %>% ggplot()+geom_polygon(aes(x=long, y=lat, group = group,fill=region))+
      theme(legend.position = "none")+ggrepel::geom_label_repel(aes(x=long,   y=lat,label=sprintf("%s,Beers:%i",Name_State,count_style),alpha=0.8),size=2,data=df_state_beer_3)+
  labs(title = "Number of Beer Types by States in USA",subtitle = "Assorment of Beer Types by State in the USA")+xlab("Latitude")+ylab("Longitude")

map
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps